Linear Regression

The Data

# Read the cleaned dataset and preview its first rows.
life <- utils::read.csv(file = "../Data/clean/life_clean.csv")
utils::head(x = life)
##   X   age gender weight_kg height_m max_bpm avg_bpm resting_bpm
## 1 0 34.91      0     65.27     1.62  188.58  157.65       69.05
## 2 1 23.37      1     56.41     1.55  179.43  131.75       73.18
## 3 2 33.20      1     58.98     1.67  175.04  123.95       54.96
## 4 3 38.69      1     93.78     1.70  191.21  155.10       50.07
## 5 4 45.09      0     52.42     1.88  193.58  152.88       70.84
## 6 5 53.19      1    105.05     1.84  176.52  130.60       61.84
##   session_duration_hours calories_burned workout_type fat_percentage
## 1                   1.00         1080.90            0       26.80038
## 2                   1.37         1809.91            1       27.65502
## 3                   0.91          802.26            2       24.32082
## 4                   1.10         1450.79            1       32.81357
## 5                   1.08         1166.40            0       17.30732
## 6                   0.69          453.33            3       32.04906
##   water_intake_liters workout_frequency_days_week experience_level   bmi
## 1                1.50                        3.99             2.01 24.87
## 2                1.90                        4.00             2.01 23.48
## 3                1.88                        2.99             1.02 21.15
## 4                2.50                        3.99             1.99 32.45
## 5                2.91                        4.00             2.00 14.83
## 6                2.91                        3.02             1.00 31.03
##   daily_meals_frequency physical_exercise  carbs proteins  fats calories
## 1                  2.99              0.01 267.68   106.05 71.63     1806
## 2                  3.01              0.97 214.32    85.41 56.97     1577
## 3                  1.99             -0.02 246.04    98.11 65.48     1608
## 4                  3.00              0.04 203.22    80.84 54.56     2657
## 5                  3.00              3.00 332.79   133.05 88.43     1470
## 6                  2.99             -0.04 170.86    67.92 46.06     2767
##   meal_type diet_type sugar_g sodium_mg cholesterol_mg serving_size_g
## 1         0         0   31.77   1729.94         285.05         120.47
## 2         0         1   12.34    693.08         300.61         109.15
## 3         1         2   42.81   2142.48         215.42         399.43
## 4         0         2    9.34    123.20           9.70         314.31
## 5         1         0   23.78   1935.11         116.89          99.22
## 6         2         3   15.89   2382.39          36.38         416.54
##   cooking_method prep_time_min cook_time_min rating name_of_exercise sets  reps
## 1              0         16.24        110.79   1.31                0 4.99 20.91
## 2              1         16.47         12.01   1.92                1 4.01 16.15
## 3              2         54.35          6.09   4.70                2 5.00 21.90
## 4              1         27.73        103.72   4.85                3 4.01 16.92
## 5              3         34.16         46.55   3.07                4 4.99 15.01
## 6              4         20.98         54.64   3.38                5 4.00 25.10
##   benefit burns_calories_per_30min target_muscle_group equipment_needed
## 1       0                   342.58                   0                0
## 2       1                   357.16                   1                1
## 3       2                   359.63                   2                1
## 4       3                   351.65                   3                2
## 5       4                   329.36                   4                3
## 6       5                   374.56                   5                4
##   difficulty_level body_part type_of_muscle workout bmi_calc cal_from_macros
## 1                0         0              0       0 24.87045         2139.59
## 2                1         1              0       1 23.47971         1711.65
## 3                1         2              1       2 21.14812         1965.92
## 4                0         3              2       3 32.44983         1627.28
## 5                0         4              3       4 14.83137         2659.23
## 6                2         2              3       5 31.02847         1369.66
##   pct_carbs protein_per_kg   pct_hrr pct_maxhr cal_balance lean_mass_kg
## 1 0.5004323      1.6247893 0.7412365 0.8359847      725.10     47.77739
## 2 0.5008501      1.5140932 0.5512471 0.7342696     -232.91     40.80980
## 3 0.5006104      1.6634452 0.5745336 0.7081239      805.74     44.63558
## 4 0.4995330      0.8620175 0.7441547 0.8111500     1206.21     63.00743
## 5 0.5005810      2.5381534 0.6684048 0.7897510      303.60     43.34750
## 6 0.4989851      0.6465493 0.5995814 0.7398595     2313.67     71.38246
##   expected_burn burns_calories_.per_30_min._bc burns_calories_bin
## 1      685.1600                   7.260425e+19                  0
## 2      978.6184                   1.020506e+20                  1
## 3      654.5266                   1.079607e+20                  1
## 4      773.6300                   8.987921e+19                  1
## 5      711.4176                   5.264685e+19                  2
## 6      516.8928                   1.505159e+20                  3
# List every column name available for modelling.
names(life)
##  [1] "X"                              "age"                           
##  [3] "gender"                         "weight_kg"                     
##  [5] "height_m"                       "max_bpm"                       
##  [7] "avg_bpm"                        "resting_bpm"                   
##  [9] "session_duration_hours"         "calories_burned"               
## [11] "workout_type"                   "fat_percentage"                
## [13] "water_intake_liters"            "workout_frequency_days_week"   
## [15] "experience_level"               "bmi"                           
## [17] "daily_meals_frequency"          "physical_exercise"             
## [19] "carbs"                          "proteins"                      
## [21] "fats"                           "calories"                      
## [23] "meal_type"                      "diet_type"                     
## [25] "sugar_g"                        "sodium_mg"                     
## [27] "cholesterol_mg"                 "serving_size_g"                
## [29] "cooking_method"                 "prep_time_min"                 
## [31] "cook_time_min"                  "rating"                        
## [33] "name_of_exercise"               "sets"                          
## [35] "reps"                           "benefit"                       
## [37] "burns_calories_per_30min"       "target_muscle_group"           
## [39] "equipment_needed"               "difficulty_level"              
## [41] "body_part"                      "type_of_muscle"                
## [43] "workout"                        "bmi_calc"                      
## [45] "cal_from_macros"                "pct_carbs"                     
## [47] "protein_per_kg"                 "pct_hrr"                       
## [49] "pct_maxhr"                      "cal_balance"                   
## [51] "lean_mass_kg"                   "expected_burn"                 
## [53] "burns_calories_.per_30_min._bc" "burns_calories_bin"
# Remove the index column `X`, then turn any character columns into factors.
life <- dplyr::mutate(
  dplyr::select(life, -X),
  across(where(is.character), as.factor)
)

glimpse(life)
## Rows: 20,000
## Columns: 53
## $ age                            <dbl> 34.91, 23.37, 33.20, 38.69, 45.09, 53.1…
## $ gender                         <int> 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, …
## $ weight_kg                      <dbl> 65.27, 56.41, 58.98, 93.78, 52.42, 105.…
## $ height_m                       <dbl> 1.62, 1.55, 1.67, 1.70, 1.88, 1.84, 1.7…
## $ max_bpm                        <dbl> 188.58, 179.43, 175.04, 191.21, 193.58,…
## $ avg_bpm                        <dbl> 157.65, 131.75, 123.95, 155.10, 152.88,…
## $ resting_bpm                    <dbl> 69.05, 73.18, 54.96, 50.07, 70.84, 61.8…
## $ session_duration_hours         <dbl> 1.00, 1.37, 0.91, 1.10, 1.08, 0.69, 1.6…
## $ calories_burned                <dbl> 1080.90, 1809.91, 802.26, 1450.79, 1166…
## $ workout_type                   <int> 0, 1, 2, 1, 0, 3, 0, 3, 0, 0, 0, 1, 1, …
## $ fat_percentage                 <dbl> 26.80038, 27.65502, 24.32082, 32.81357,…
## $ water_intake_liters            <dbl> 1.50, 1.90, 1.88, 2.50, 2.91, 2.91, 2.7…
## $ workout_frequency_days_week    <dbl> 3.99, 4.00, 2.99, 3.99, 4.00, 3.02, 4.9…
## $ experience_level               <dbl> 2.01, 2.01, 1.02, 1.99, 2.00, 1.00, 3.0…
## $ bmi                            <dbl> 24.87, 23.48, 21.15, 32.45, 14.83, 31.0…
## $ daily_meals_frequency          <dbl> 2.99, 3.01, 1.99, 3.00, 3.00, 2.99, 2.0…
## $ physical_exercise              <dbl> 0.01, 0.97, -0.02, 0.04, 3.00, -0.04, -…
## $ carbs                          <dbl> 267.68, 214.32, 246.04, 203.22, 332.79,…
## $ proteins                       <dbl> 106.05, 85.41, 98.11, 80.84, 133.05, 67…
## $ fats                           <dbl> 71.63, 56.97, 65.48, 54.56, 88.43, 46.0…
## $ calories                       <int> 1806, 1577, 1608, 2657, 1470, 2767, 186…
## $ meal_type                      <int> 0, 0, 1, 0, 1, 2, 1, 2, 0, 0, 1, 2, 3, …
## $ diet_type                      <int> 0, 1, 2, 2, 0, 3, 4, 4, 0, 2, 3, 3, 0, …
## $ sugar_g                        <dbl> 31.77, 12.34, 42.81, 9.34, 23.78, 15.89…
## $ sodium_mg                      <dbl> 1729.94, 693.08, 2142.48, 123.20, 1935.…
## $ cholesterol_mg                 <dbl> 285.05, 300.61, 215.42, 9.70, 116.89, 3…
## $ serving_size_g                 <dbl> 120.47, 109.15, 399.43, 314.31, 99.22, …
## $ cooking_method                 <int> 0, 1, 2, 1, 3, 4, 1, 5, 2, 0, 3, 0, 3, …
## $ prep_time_min                  <dbl> 16.24, 16.47, 54.35, 27.73, 34.16, 20.9…
## $ cook_time_min                  <dbl> 110.79, 12.01, 6.09, 103.72, 46.55, 54.…
## $ rating                         <dbl> 1.31, 1.92, 4.70, 4.85, 3.07, 3.38, 3.8…
## $ name_of_exercise               <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1…
## $ sets                           <dbl> 4.99, 4.01, 5.00, 4.01, 4.99, 4.00, 5.0…
## $ reps                           <dbl> 20.91, 16.15, 21.90, 16.92, 15.01, 25.1…
## $ benefit                        <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 2, 4, 9, 10,…
## $ burns_calories_per_30min       <dbl> 342.58, 357.16, 359.63, 351.65, 329.36,…
## $ target_muscle_group            <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 4, 11…
## $ equipment_needed               <int> 0, 1, 1, 2, 3, 4, 5, 6, 7, 8, 3, 9, 10,…
## $ difficulty_level               <int> 0, 1, 1, 0, 0, 2, 0, 1, 2, 1, 1, 0, 0, …
## $ body_part                      <int> 0, 1, 2, 3, 4, 2, 3, 5, 1, 6, 4, 3, 4, …
## $ type_of_muscle                 <int> 0, 0, 1, 2, 3, 3, 3, 4, 5, 6, 7, 8, 6, …
## $ workout                        <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1…
## $ bmi_calc                       <dbl> 24.87045, 23.47971, 21.14812, 32.44983,…
## $ cal_from_macros                <dbl> 2139.59, 1711.65, 1965.92, 1627.28, 265…
## $ pct_carbs                      <dbl> 0.5004323, 0.5008501, 0.5006104, 0.4995…
## $ protein_per_kg                 <dbl> 1.6247893, 1.5140932, 1.6634452, 0.8620…
## $ pct_hrr                        <dbl> 0.7412365, 0.5512471, 0.5745336, 0.7441…
## $ pct_maxhr                      <dbl> 0.8359847, 0.7342696, 0.7081239, 0.8111…
## $ cal_balance                    <dbl> 725.10, -232.91, 805.74, 1206.21, 303.6…
## $ lean_mass_kg                   <dbl> 47.77739, 40.80980, 44.63558, 63.00743,…
## $ expected_burn                  <dbl> 685.1600, 978.6184, 654.5266, 773.6300,…
## $ burns_calories_.per_30_min._bc <dbl> 7.260425e+19, 1.020506e+20, 1.079607e+2…
## $ burns_calories_bin             <int> 0, 1, 1, 1, 2, 3, 2, 3, 1, 2, 1, 3, 1, …

Variables for each model

# Predictor sets for the three candidate models. Element order is preserved
# because it fixes the order of terms in the formulas built from these vectors.

# Exercise-session predictors.
# NOTE(review): workout_type, difficulty_level, workout, target_muscle_group,
# body_part and type_of_muscle are read in as integers, so lm() fits a single
# numeric slope for each. If they are category codes they probably belong in
# factor() -- confirm against the data dictionary.
exer <- c(
  "session_duration_hours",
  "max_bpm",
  "avg_bpm",
  "resting_bpm",
  "workout_frequency_days_week",
  "experience_level",
  "sets",
  "reps",
  "workout_type",
  "difficulty_level",
  "workout",
  "target_muscle_group",
  "body_part",
  "type_of_muscle"
)

# Diet / meal predictors.
# NOTE(review): lm() reports an NA coefficient for cal_from_macros in the diet
# model, i.e. it is linearly dependent on the other diet predictors. Consider
# dropping it.
diet <- c(
  "carbs",
  "proteins",
  "fats",
  "calories",
  "sugar_g",
  "sodium_mg",
  "cholesterol_mg",
  "meal_type",
  "diet_type",
  "daily_meals_frequency",
  "serving_size_g",
  "cooking_method",
  "prep_time_min",
  "cook_time_min",
  "pct_carbs",
  "protein_per_kg",
  "cal_from_macros",
  "cal_balance"
)

# Exercise predictors first, then diet predictors, without duplicates.
combined <- unique(c(exer, diet))
# Sanity check: exercise predictors that are missing from the data (expect none).
setdiff(x = exer, y = names(life))
## character(0)
# Sanity check: diet predictors that are missing from the data (expect none).
setdiff(x = diet, y = names(life))
## character(0)

Modeling

# modeling
#' Build a model formula from a response name and predictor terms.
#'
#' @param y Single string naming the response (left-hand side).
#' @param xs Character vector of predictor terms; joined with ` + `.
#' @return A formula `y ~ x1 + x2 + ...` whose environment is the caller's
#'   environment.
make_formula <- function(y, xs) {
  # Capture the caller's frame eagerly. Without an explicit `env`,
  # as.formula() attaches this helper's own frame to the formula, so a model
  # variable named `y` or `xs` that is missing from `data` would silently
  # resolve to these arguments instead of raising "object not found".
  caller_env <- parent.frame()

  if (length(y) != 1L || is.na(y)) {
    stop("`y` must be a single, non-missing response name.", call. = FALSE)
  }
  # An empty `xs` would otherwise produce "y ~ " and a cryptic parse error.
  if (length(xs) == 0L || anyNA(xs)) {
    stop("`xs` must contain at least one non-missing predictor.", call. = FALSE)
  }

  rhs <- paste(xs, collapse = " + ")
  stats::as.formula(paste(y, "~", rhs), env = caller_env)
}

# One formula per predictor set, all with the same response.
f_exer <- make_formula(y = "burns_calories_per_30min", xs = exer)
f_diet <- make_formula(y = "burns_calories_per_30min", xs = diet)
f_combined <- make_formula(y = "burns_calories_per_30min", xs = combined)

# Least-squares fits on the full dataset.
lm_exer <- lm(formula = f_exer, data = life)
lm_diet <- lm(formula = f_diet, data = life)
lm_combined <- lm(formula = f_combined, data = life)

# One-row fit summary for the exercise model.
broom::glance(x = lm_exer)
## # A tibble: 1 × 12
##   r.squared adj.r.squared sigma statistic p.value    df  logLik     AIC     BIC
##       <dbl>         <dbl> <dbl>     <dbl>   <dbl> <dbl>   <dbl>   <dbl>   <dbl>
## 1     0.287         0.286  27.2      574.       0    14 -94402. 188836. 188963.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
# One-row fit summary for the diet model.
broom::glance(x = lm_diet)
## # A tibble: 1 × 12
##   r.squared adj.r.squared sigma statistic   p.value    df  logLik     AIC    BIC
##       <dbl>         <dbl> <dbl>     <dbl>     <dbl> <dbl>   <dbl>   <dbl>  <dbl>
## 1    0.0258        0.0250  31.7      31.2 2.88e-100    17 -97518. 195074. 1.95e5
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
# One-row fit summary for the combined model.
broom::glance(x = lm_combined)
## # A tibble: 1 × 12
##   r.squared adj.r.squared sigma statistic p.value    df  logLik     AIC     BIC
##       <dbl>         <dbl> <dbl>     <dbl>   <dbl> <dbl>   <dbl>   <dbl>   <dbl>
## 1     0.306         0.305  26.8      284.       0    31 -94123. 188313. 188574.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
# Collect the three fits under short labels; the labels become the `model`
# column of the comparison tables built from this list.
model_list <- stats::setNames(
  list(lm_exer, lm_diet, lm_combined),
  c("exercise", "diet", "combined")
)

library(broom)
library(dplyr)
library(purrr)

# One glance() row per model, labelled by the list names, reduced to the
# headline fit statistics and sorted best-first by adjusted R-squared.
model_comp <- dplyr::bind_rows(lapply(model_list, glance), .id = "model") %>%
  dplyr::select(
    model, r.squared, adj.r.squared,
    sigma,      # residual std error
    statistic,  # F-statistic
    p.value,    # global F-test p-value
    AIC, BIC
  ) %>%
  arrange(desc(adj.r.squared))

model_comp
## # A tibble: 3 × 8
##   model    r.squared adj.r.squared sigma statistic   p.value     AIC     BIC
##   <chr>        <dbl>         <dbl> <dbl>     <dbl>     <dbl>   <dbl>   <dbl>
## 1 combined    0.306         0.305   26.8     284.  0         188313. 188574.
## 2 exercise    0.287         0.286   27.2     574.  0         188836. 188963.
## 3 diet        0.0258        0.0250  31.7      31.2 2.88e-100 195074. 195224.
#' In-sample root mean squared error of a fitted model.
#'
#' Divides the residual sum of squares by the number of residuals, not by the
#' residual degrees of freedom, so it is slightly smaller than the `sigma`
#' reported by `glance()`.
#'
#' @param mod A fitted model with a `residuals()` method (e.g. from `lm()`).
#' @return Numeric scalar: square root of the mean squared residual.
rmse <- function(mod) {
  # Fits made with na.action = na.exclude pad residuals() with NA for the
  # dropped rows; ignore those so the result is not NA.
  sq_resid <- stats::residuals(mod)^2
  sqrt(mean(sq_resid, na.rm = TRUE))
}

# In-sample RMSE per model, keyed by the model label.
rmse_tbl <- tibble(
  model = names(model_list),
  rmse = unname(vapply(model_list, rmse, numeric(1)))
)

rmse_tbl
## # A tibble: 3 × 2
##   model     rmse
##   <chr>    <dbl>
## 1 exercise  27.1
## 2 diet      31.7
## 3 combined  26.8
# Attach each model's RMSE to the fit-statistics table, matching on `model`.
model_summary <- left_join(x = model_comp, y = rmse_tbl, by = "model")

model_summary
## # A tibble: 3 × 9
##   model    r.squared adj.r.squared sigma statistic   p.value    AIC    BIC  rmse
##   <chr>        <dbl>         <dbl> <dbl>     <dbl>     <dbl>  <dbl>  <dbl> <dbl>
## 1 combined    0.306         0.305   26.8     284.  0         1.88e5 1.89e5  26.8
## 2 exercise    0.287         0.286   27.2     574.  0         1.89e5 1.89e5  27.1
## 3 diet        0.0258        0.0250  31.7      31.2 2.88e-100 1.95e5 1.95e5  31.7

Plotting

#' Build three residual diagnostic plots for a fitted model.
#'
#' @param model A fitted model accepted by `augment()`.
#' @param title_prefix Text prepended to each plot title.
#' @return A named list of ggplot objects: `residuals_plot`, `qq_plot`,
#'   `leverage_plot`.
plot_diagnostics <- function(model, title_prefix = "") {
  # augment() supplies the per-observation columns used below
  # (.fitted, .resid, .hat, .cooksd).
  obs <- augment(model)

  resid_vs_fitted <- ggplot(obs, aes(.fitted, .resid)) +
    geom_point(alpha = 0.4) +
    geom_hline(yintercept = 0, linetype = "dashed") +
    labs(
      title = paste0(title_prefix, "Residuals vs Fitted"),
      x = "Fitted Values",
      y = "Residuals"
    ) +
    theme_minimal()

  resid_qq <- ggplot(obs, aes(sample = .resid)) +
    stat_qq(alpha = 0.4) +
    stat_qq_line() +
    labs(
      title = paste0(title_prefix, "QQ Plot of Residuals"),
      x = "Theoretical Quantiles",
      y = "Sample Quantiles"
    ) +
    theme_minimal()

  leverage_vs_cooks <- ggplot(obs, aes(.hat, .cooksd)) +
    geom_point(alpha = 0.4) +
    labs(
      title = paste0(title_prefix, "Leverage vs Cook's Distance"),
      x = "Leverage (Hat Values)",
      y = "Cook's Distance"
    ) +
    theme_minimal()

  list(
    residuals_plot = resid_vs_fitted,
    qq_plot = resid_qq,
    leverage_plot = leverage_vs_cooks
  )
}
# exercise diagnostics, shown one plot at a time
diag_ex <- plot_diagnostics(model = lm_exer, title_prefix = "Exercise Model: ")
diag_ex[["residuals_plot"]]

diag_ex[["qq_plot"]]

diag_ex[["leverage_plot"]]

# diet diagnostics, printed as the full list
diag_diet <- plot_diagnostics(model = lm_diet, title_prefix = "Diet Model: ")
diag_diet
## $residuals_plot

## 
## $qq_plot

## 
## $leverage_plot

# combined-model diagnostics, printed as the full list
diag_combined <- plot_diagnostics(
  model = lm_combined,
  title_prefix = "Combined Model: "
)
diag_combined
## $residuals_plot

## 
## $qq_plot

## 
## $leverage_plot

# Predicted vs actual calorie burn for each model. augment() adds the fitted
# values (.fitted) and residuals to the data used in the fit; the dashed red
# line is y = x, i.e. where a perfect prediction would fall.
plot_pred_vs_actual <- function(aug, title) {
  ggplot(aug, aes(x = .fitted, y = burns_calories_per_30min)) +
    geom_point(alpha = 0.4, color = "steelblue") +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
    labs(
      title = title,
      x = "Predicted Calorie Burn",
      y = "Actual Calorie Burn"
    ) +
    theme_minimal()
}

aug_exer <- augment(lm_exer)

plot_pred_vs_actual(aug_exer, "Exercise Model: Predicted vs Actual")

aug_diet <- augment(lm_diet)

plot_pred_vs_actual(aug_diet, "Diet Model: Predicted vs Actual")

aug_combined <- augment(lm_combined)

plot_pred_vs_actual(aug_combined, "Exercise & Diet Model: Predicted vs Actual")

# Per-predictor coefficient table for the exercise model, smallest p-value first.
arrange(tidy(lm_exer), p.value)
## # A tibble: 15 × 5
##    term                          estimate std.error statistic  p.value
##    <chr>                            <dbl>     <dbl>     <dbl>    <dbl>
##  1 (Intercept)                 212.          4.48     47.4    0       
##  2 sets                         22.5         0.340    66.3    0       
##  3 reps                          2.23        0.0535   41.6    0       
##  4 session_duration_hours       -7.56        0.865    -8.73   2.75e-18
##  5 avg_bpm                      -0.107       0.0135   -7.93   2.32e-15
##  6 experience_level              4.25        0.563     7.55   4.51e-14
##  7 resting_bpm                   0.167       0.0264    6.30   2.99e-10
##  8 workout_frequency_days_week  -0.959       0.385    -2.49   1.28e- 2
##  9 type_of_muscle               -0.116       0.0514   -2.26   2.36e- 2
## 10 difficulty_level              0.276       0.235     1.17   2.42e- 1
## 11 body_part                     0.102       0.0959    1.06   2.87e- 1
## 12 max_bpm                      -0.00733     0.0168   -0.437  6.62e- 1
## 13 workout_type                  0.0742      0.171     0.434  6.65e- 1
## 14 target_muscle_group           0.00623     0.0184    0.339  7.35e- 1
## 15 workout                       0.000891    0.0125    0.0710 9.43e- 1
# Per-predictor coefficient table for the diet model, smallest p-value first.
arrange(tidy(lm_diet), p.value)
## # A tibble: 19 × 5
##    term                     estimate  std.error statistic   p.value
##    <chr>                       <dbl>      <dbl>     <dbl>     <dbl>
##  1 sugar_g                 -0.197      0.0156     -12.6    2.43e-36
##  2 prep_time_min           -0.116      0.0136      -8.51   1.87e-17
##  3 daily_meals_frequency    2.48       0.357        6.95   3.73e-12
##  4 cholesterol_mg          -0.0144     0.00258     -5.57   2.59e- 8
##  5 protein_per_kg          -6.50       1.36        -4.77   1.86e- 6
##  6 sodium_mg               -0.00121    0.000314    -3.86   1.13e- 4
##  7 serving_size_g          -0.00629    0.00195     -3.23   1.24e- 3
##  8 diet_type                0.271      0.132        2.05   4.01e- 2
##  9 (Intercept)            752.       369.           2.04   4.15e- 2
## 10 cook_time_min           -0.00958    0.00674     -1.42   1.55e- 1
## 11 pct_carbs             -805.       738.          -1.09   2.75e- 1
## 12 carbs                    0.834      0.789        1.06   2.91e- 1
## 13 fats                    -1.85       1.85        -0.998  3.18e- 1
## 14 calories                 0.00108    0.00112      0.964  3.35e- 1
## 15 proteins                -0.762      0.872       -0.873  3.82e- 1
## 16 meal_type               -0.0603     0.200       -0.301  7.63e- 1
## 17 cal_balance             -0.000133   0.000450    -0.295  7.68e- 1
## 18 cooking_method          -0.0249     0.113       -0.220  8.26e- 1
## 19 cal_from_macros         NA         NA           NA     NA
# Per-predictor coefficient table for the combined model, smallest p-value first.
arrange(tidy(lm_combined), p.value)
## # A tibble: 33 × 5
##    term                   estimate std.error statistic  p.value
##    <chr>                     <dbl>     <dbl>     <dbl>    <dbl>
##  1 sets                    22.4       0.337      66.4  0       
##  2 reps                     2.23      0.0532     41.8  0       
##  3 sugar_g                 -0.162     0.0133    -12.2  4.77e-34
##  4 avg_bpm                 -0.121     0.0134     -9.03 1.81e-19
##  5 prep_time_min           -0.0937    0.0115     -8.14 4.02e-16
##  6 resting_bpm              0.187     0.0262      7.15 9.29e-13
##  7 protein_per_kg          -7.31      1.16       -6.32 2.67e-10
##  8 experience_level         3.29      0.571       5.75 8.95e- 9
##  9 session_duration_hours  -7.28      1.27       -5.74 9.40e- 9
## 10 daily_meals_frequency    1.40      0.302       4.62 3.89e- 6
## # ℹ 23 more rows
# Raw relationship between session length and calorie burn, with a simple
# least-squares line (y ~ x) overlaid in red.
ggplot(life, aes(x = session_duration_hours, y = burns_calories_per_30min)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "lm", formula = y ~ x, color = "red") +
  labs(
    title = "Linear Trend: Session Duration vs Calorie Burn",
    x = "Session Duration (hours)",
    y = "Calorie Burn per 30min"
  ) +
  theme_minimal()

#' Plot the fitted effect of one predictor with its confidence band.
#'
#' @param model A fitted model accepted by `effects::effect()`.
#' @param var String naming the predictor whose effect is plotted.
#' @param title_prefix Text prepended to the plot title.
#' @return A ggplot object: fitted line over `var` with a lower/upper ribbon.
plot_effect <- function(model, var, title_prefix = "") {
  eff <- effects::effect(var, model, xlevels = 20)
  df_eff <- as.data.frame(eff)

  # An all-NA effect draws an empty panel with no explanation (this happens
  # for the diet model in this analysis), so say so explicitly.
  if (all(is.na(df_eff$fit))) {
    warning(
      "Effect of '", var, "' is NA at every level; the model may be ",
      "rank-deficient (check for NA coefficients).",
      call. = FALSE
    )
  }

  # aes_string() is deprecated in ggplot2; the .data pronoun is the supported
  # way to map a column whose name is held in a string.
  ggplot(df_eff, aes(x = .data[[var]], y = .data[["fit"]])) +
    geom_line() +
    geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.2) +
    labs(
      title = paste0(title_prefix, "Effect of ", var),
      x = var,
      y = "Predicted Calorie Burn per 30 min"
    ) +
    theme_minimal()
}

# Example effect plots: two exercise predictors, then two diet predictors.
plot_effect(
  model = lm_exer,
  var = "session_duration_hours",
  title_prefix = "Exercise Model: "
)

plot_effect(model = lm_exer, var = "max_bpm", title_prefix = "Exercise Model: ")

plot_effect(model = lm_diet, var = "carbs", title_prefix = "Diet Model: ")

plot_effect(model = lm_diet, var = "calories", title_prefix = "Diet Model: ")

# Compute the carbs effect for the diet model by hand so the underlying
# numbers can be inspected alongside the plot.
eff_carbs <- effects::effect("carbs", lm_diet, xlevels = 20)

eff_carbs_df <- as.data.frame(eff_carbs)

ggplot(eff_carbs_df, aes(x = carbs, y = fit)) +
  geom_line() +
  geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.2) +
  labs(
    title = "Effect of Carbs on Calorie Burn (Diet Model)",
    x = "Carbs (g)",
    y = "Predicted Calorie Burn per 30 min"
  ) +
  theme_minimal()

summary(object = eff_carbs_df)
##      carbs            fit            se          lower         upper    
##  Min.   :140.0   Min.   : NA   Min.   : NA   Min.   : NA   Min.   : NA  
##  1st Qu.:217.5   1st Qu.: NA   1st Qu.: NA   1st Qu.: NA   1st Qu.: NA  
##  Median :300.0   Median : NA   Median : NA   Median : NA   Median : NA  
##  Mean   :300.0   Mean   :NaN   Mean   :NaN   Mean   :NaN   Mean   :NaN  
##  3rd Qu.:382.5   3rd Qu.: NA   3rd Qu.: NA   3rd Qu.: NA   3rd Qu.: NA  
##  Max.   :460.0   Max.   : NA   Max.   : NA   Max.   : NA   Max.   : NA  
##                  NA's   :20    NA's   :20    NA's   :20    NA's   :20
# First rows of the effect table.
utils::head(x = eff_carbs_df)
##   carbs fit se lower upper
## 1   140  NA NA    NA    NA
## 2   150  NA NA    NA    NA
## 3   170  NA NA    NA    NA
## 4   190  NA NA    NA    NA
## 5   210  NA NA    NA    NA
## 6   220  NA NA    NA    NA
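# Every effect estimate is NA because the diet model is rank-deficient: carbs
# is collinear with other diet variables. In particular, lm() could not
# estimate cal_from_macros (NA coefficient above); in the rows printed by
# head() it equals 4 * carbs + 4 * proteins + 9 * fats exactly. Dropping
# cal_from_macros from `diet` should remove the exact dependency.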

Machine Learning

# 80/20 train/test split; the seed is fixed so the split is reproducible.
set.seed(400)

life_split <- initial_split(data = life, prop = 0.8)
life_train <- training(x = life_split)
life_test <- testing(x = life_split)

# Refit the same three formulas on the training rows only.
lm_exer_train <- lm(formula = f_exer, data = life_train)
lm_diet_train <- lm(formula = f_diet, data = life_train)
lm_combined_train <- lm(formula = f_combined, data = life_train)
# Out-of-sample RMSE: predict the held-out rows with each training fit and
# compare against the observed calorie burn.
test_results <- tibble(
  model = c("exercise", "diet", "combined"),
  rmse = c(
    rmse_vec(
      truth = life_test$burns_calories_per_30min,
      estimate = predict(lm_exer_train, newdata = life_test)
    ),
    rmse_vec(
      truth = life_test$burns_calories_per_30min,
      estimate = predict(lm_diet_train, newdata = life_test)
    ),
    rmse_vec(
      truth = life_test$burns_calories_per_30min,
      estimate = predict(lm_combined_train, newdata = life_test)
    )
  )
)

test_results
## # A tibble: 3 × 2
##   model     rmse
##   <chr>    <dbl>
## 1 exercise  26.9
## 2 diet      31.2
## 3 combined  26.5